Making a new factor variable for elevators named elevatorF.
VIT2005 <- VIT2005 %>%
mutate(elevatorF = as.factor(elevator)) %>%
glimpse()
Rows: 218
Columns: 16
$ totalprice <dbl> 228000.0, 409000.0, 200000.0, 180000.0, 443600.0, 1730…
$ area <dbl> 75.31, 100.65, 88.87, 62.61, 146.15, 77.21, 77.04, 62.…
$ zone <fct> Z45, Z31, Z52, Z62, Z31, Z11, Z47, Z48, Z11, Z11, Z37,…
$ category <fct> 4B, 3B, 3A, 4A, 3A, 4B, 3A, 3B, 4B, 3B, 4B, 4B, 2B, 4B…
$ age <int> 33, 5, 14, 41, 22, 35, 14, 36, 37, 11, 36, 10, 7, 9, 1…
$ floor <int> 3, 7, 8, 3, 6, 4, 6, 3, 4, 5, 6, 3, 4, 2, 6, 4, 4, 4, …
$ rooms <int> 5, 5, 5, 4, 7, 5, 4, 4, 4, 4, 6, 5, 6, 5, 5, 4, 4, 5, …
$ out <fct> E100, E50, E50, E50, E100, E50, E50, E100, E25, E50, E…
$ conservation <fct> 2B, 1A, 1A, 2A, 1A, 1A, 1A, 3A, 2A, 1A, 2B, 1A, 1A, 1A…
$ toilets <int> 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, …
$ garage <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, …
$ elevator <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, …
$ streetcategory <fct> S3, S5, S2, S3, S4, S4, S3, S3, S4, S4, S2, S3, S4, S4…
$ heating <fct> 3A, 4A, 3A, 1A, 4A, 3A, 4A, 3A, 3A, 3A, 4A, 3A, 3B, 3A…
$ storage <int> 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
$ elevatorF <fct> 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, …
totalprice versus area.ggplot(data = VIT2005, aes(x = area, y = totalprice)) +
geom_point() +
theme_bw() +
labs(x = "Area in square meters", y = "Price in Euros")
Figure 1.1: No this is Boo playing a joke
To refer to the Figure 1.1, use the syntax \@ref(fig:label).
cor().values <- VIT2005 %>%
mutate(x_xbar = area - mean(area), y_ybar = totalprice - mean(totalprice),
zx = x_xbar/sd(area), zy = y_ybar/sd(totalprice)) %>%
select(area, x_xbar, zx, totalprice, y_ybar, zy)
head(values)
area x_xbar zx totalprice y_ybar zy
1 75.31 -13.3928463 -0.645972464 228000 -52741.52 -0.7610779
2 100.65 11.9471576 0.576243069 409000 128258.48 1.8508128
3 88.87 0.1671589 0.008062516 200000 -80741.52 -1.1651273
4 62.61 -26.0928433 -1.258526968 180000 -100741.52 -1.4537340
5 146.15 57.4471500 2.770828263 443600 162858.48 2.3501024
6 77.21 -11.4928448 -0.554330357 173000 -107741.52 -1.5547463
values %>%
summarize(r = sum(zx*zy)/(length(zx) - 1))
r
1 0.8092125
#
# Built In Function
VIT2005 %>%
summarize(r = cor(totalprice, area))
r
1 0.8092125
mod1 by regressing totalprice on to area.mod1 <- lm(totalprice ~ area, data = VIT2005)
mod1
Call:
lm(formula = totalprice ~ area, data = VIT2005)
Coefficients:
(Intercept) area
40822 2705
\[\widehat{\text{totalprice}} = 4.0822416\times 10^{4} + 2704.7510279\cdot \text{area}\]
mod1 and mutate() to compute the \(\hat{y}\) and \(e_i\) values.values <- VIT2005 %>%
mutate(yhat = coef(mod1)[1] + coef(mod1)[2]*area,
e = totalprice - yhat) %>%
select(totalprice, yhat, area, e)
head(values)
totalprice yhat area e
1 228000 244517.2 75.31 -16517.209
2 409000 313055.6 100.65 95944.389
3 200000 281193.6 88.87 -81193.647
4 180000 210166.9 62.61 -30166.879
5 443600 436121.8 146.15 7478.238
6 173000 249656.2 77.21 -76656.240
# Or
values <- VIT2005 %>%
mutate(yhat = predict(mod1), e = resid(mod1)) %>%
select(totalprice, yhat, area, e)
head(values)
totalprice yhat area e
1 228000 244517.2 75.31 -16517.209
2 409000 313055.6 100.65 95944.389
3 200000 281193.6 88.87 -81193.647
4 180000 210166.9 62.61 -30166.879
5 443600 436121.8 146.15 7478.238
6 173000 249656.2 77.21 -76656.240
totalprice on to area using the formulas below.\[b_1 = r\cdot\frac{s_y}{s_x}\] \[b_0 = \bar{y} - b_1\cdot\bar{x}\]
VIT2005 %>%
summarize(r = cor(totalprice, area),
b1 = r*sd(totalprice)/sd(area),
b0 = mean(totalprice) - b1*mean(area))
r b1 b0
1 0.8092125 2704.751 40822.42
mod1 using get_regression_table().CT <- get_regression_table(mod1)
CT
# A tibble: 2 x 7
term estimate std_error statistic p_value lower_ci upper_ci
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 intercept 40822. 12170. 3.35 0.001 16835. 64810.
2 area 2705. 134. 20.2 0 2441. 2968.
CT[1, 2]
# A tibble: 1 x 1
estimate
<dbl>
1 40822.
CT[2, 2]
# A tibble: 1 x 1
estimate
<dbl>
1 2705.
Create the linear model object mod2 by regressing totalprice on to elevatorF.
What totalprice do you predict for an apartment without an elevator and with an elevator (use the output from mod2 only).
Answer the previous question using group_by() and mean.
evals data set to create a parallel slopes model by regressing score on bty_avg and rank. Store the result in mod4.mod4 <- lm(score ~ bty_avg + rank, data = evals)
mod4
Call:
lm(formula = score ~ bty_avg + rank, data = evals)
Coefficients:
(Intercept) bty_avg ranktenure track ranktenured
3.98155 0.06783 -0.16070 -0.12623
Write out the least squares regression lines for the three ranks.
Graph mod4 with ggplot (parallel slopes model).
url <- "http://faculty.marshall.usc.edu/gareth-james/ISL/Advertising.csv"
if(!dir.exists("../Data/")){
dir.create("../Data/")
}
if(!file.exists("../Data/Advertising.csv")){
download.file(url, destfile = "../Data/Advertising.csv")}
Ad <- read.csv("../Data/Advertising.csv")
head(Ad)
X TV radio newspaper sales
1 1 230.1 37.8 69.2 22.1
2 2 44.5 39.3 45.1 10.4
3 3 17.2 45.9 69.3 9.3
4 4 151.5 41.3 58.5 18.5
5 5 180.8 10.8 58.4 12.9
6 6 8.7 48.9 75.0 7.2
# Want to create a 3D scatterplot of sales ~ TV + Radio
library(plotly)
p <- plot_ly(data = Ad, z = ~sales, x = ~TV, y = ~radio, opacity = 0.6) %>%
add_markers()
p
Next we want to add the best fit plane to the 3D scatterplot. Consider the best fit plane sales_mod:
sales_mod <- lm(sales ~TV + radio, data = Ad)
summary(sales_mod)
Call:
lm(formula = sales ~ TV + radio, data = Ad)
Residuals:
Min 1Q Median 3Q Max
-8.7977 -0.8752 0.2422 1.1708 2.8328
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.92110 0.29449 9.919 <2e-16 ***
TV 0.04575 0.00139 32.909 <2e-16 ***
radio 0.18799 0.00804 23.382 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.681 on 197 degrees of freedom
Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
x <- seq(0, 300, length = 70)
y <- seq(0, 50, length = 70)
plane <- outer(x, y, function(a, b){summary(sales_mod)$coef[1,1] +
summary(sales_mod)$coef[2,1]*a + summary(sales_mod)$coef[3,1]*b})
# draw the plane
p %>%
add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
Consider a model with interaction:
int_mod <- lm(sales ~ TV + radio + TV:radio, data = Ad)
summary(int_mod)
Call:
lm(formula = sales ~ TV + radio + TV:radio, data = Ad)
Residuals:
Min 1Q Median 3Q Max
-6.3366 -0.4028 0.1831 0.5948 1.5246
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.750e+00 2.479e-01 27.233 <2e-16 ***
TV 1.910e-02 1.504e-03 12.699 <2e-16 ***
radio 2.886e-02 8.905e-03 3.241 0.0014 **
TV:radio 1.086e-03 5.242e-05 20.727 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9435 on 196 degrees of freedom
Multiple R-squared: 0.9678, Adjusted R-squared: 0.9673
F-statistic: 1963 on 3 and 196 DF, p-value: < 2.2e-16
plane_int <- outer(x, y, function(a, b){summary(int_mod)$coef[1,1] +
summary(int_mod)$coef[2,1]*a + summary(int_mod)$coef[3,1]*b +
summary(int_mod)$coef[4,1]*a*b})
p %>%
add_surface(x = ~x, y = ~y, z = ~plane_int, showscale = FALSE)
Create a 3D representation of the best fit plane for the debt_model in Section 6.2.2 of Modern Dive.
library(ISLR)
head(Credit)
ID Income Limit Rating Cards Age Education Gender Student Married Ethnicity
1 1 14.891 3606 283 2 34 11 Male No Yes Caucasian
2 2 106.025 6645 483 3 82 15 Female Yes Yes Asian
3 3 104.593 7075 514 4 71 11 Male No No Asian
4 4 148.924 9504 681 3 36 11 Female No No Asian
5 5 55.882 4897 357 2 68 16 Male No Yes Caucasian
6 6 80.180 8047 569 4 77 10 Male No No Caucasian
Balance
1 333
2 903
3 580
4 964
5 331
6 1151
credit_ch6 <- Credit %>%
as_tibble %>%
select(ID, debt = Balance, credit_limit = Limit, income = Income, credit_rating = Rating, age = Age)
glimpse(credit_ch6)
Rows: 400
Columns: 6
$ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
$ debt <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 140…
$ credit_limit <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 6…
$ income <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.180, 20.9…
$ credit_rating <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, …
$ age <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49,…
credit_ch6 %>%
select(debt, credit_limit, income) %>%
cor()
debt credit_limit income
debt 1.0000000 0.8616973 0.4636565
credit_limit 0.8616973 1.0000000 0.7920883
income 0.4636565 0.7920883 1.0000000
p1 <- ggplot(credit_ch6, aes(x = credit_limit, y = debt)) +
geom_point(color = "red") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw() +
labs(x = "Credit limit (in $)", y = "Credit card debt (in $)",
title = "Debt and credit limit")
p2 <- ggplot(data = credit_ch6, aes(x = income, y = debt)) +
geom_point(color = "red") +
geom_smooth(method = "lm", se = FALSE) +
theme_bw() +
labs(x = "Income (in $1000)", y = "Credit card debt (in $)",
title = "Debt and income")
library(gridExtra)
grid.arrange(p1, p2, ncol = 2)
# Create a categorical variable with four levels for credit_limit
credit_ch6 <- credit_ch6 %>%
mutate(credit_limit_cat = cut(credit_limit,
breaks = quantile(credit_limit, probs= c(0, .25, .5, .75, 1)), names = TRUE))
############
ggplot(data = credit_ch6, aes(x = income, y = debt, color = credit_limit_cat)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
theme_bw() +
labs(x = "Income (in $1000)", y = "Credit card debt (in $)",
title = "Debt and income")
mod <- lm(debt ~ credit_limit + income, data = credit_ch6)
summary(mod)
Call:
lm(formula = debt ~ credit_limit + income, data = credit_ch6)
Residuals:
Min 1Q Median 3Q Max
-232.79 -115.45 -48.20 53.36 549.77
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -385.17926 19.46480 -19.79 <2e-16 ***
credit_limit 0.26432 0.00588 44.95 <2e-16 ***
income -7.66332 0.38507 -19.90 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 165.5 on 397 degrees of freedom
Multiple R-squared: 0.8711, Adjusted R-squared: 0.8705
F-statistic: 1342 on 2 and 397 DF, p-value: < 2.2e-16
p <- plot_ly(data = credit_ch6, z = ~debt, x = ~credit_limit, y = ~income, opacity = 0.8) %>%
add_markers(marker = list(size = 8))
p
x <- seq(855, 13913, length = 70)
y <- seq(10, 187, length = 70)
plane <- outer(x, y, function(a, b){summary(mod)$coef[1,1] +
summary(mod)$coef[2,1]*a + summary(mod)$coef[3,1]*b})
# draw the plane
p %>%
add_surface(x = ~x, y = ~y, z = ~plane, showscale = FALSE)
int_mod2 <- lm(debt ~ credit_limit + income + credit_limit:income, data = credit_ch6)
summary(int_mod2)
Call:
lm(formula = debt ~ credit_limit + income + credit_limit:income,
data = credit_ch6)
Residuals:
Min 1Q Median 3Q Max
-211.53 -110.14 -42.20 49.65 561.27
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.071e+02 3.046e+01 -10.082 < 2e-16 ***
credit_limit 2.524e-01 6.839e-03 36.906 < 2e-16 ***
income -9.785e+00 7.461e-01 -13.115 < 2e-16 ***
credit_limit:income 2.672e-04 8.082e-05 3.306 0.00103 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 163.4 on 396 degrees of freedom
Multiple R-squared: 0.8746, Adjusted R-squared: 0.8736
F-statistic: 920.4 on 3 and 396 DF, p-value: < 2.2e-16
# int_plane2
int2_plane <- outer(x, y, function(a, b){summary(int_mod2)$coef[1,1] +
summary(int_mod2)$coef[2,1]*a + summary(int_mod2)$coef[3,1]*b + summary(int_mod2)$coef[4,1]*a*b})
# draw the plane
p %>%
add_surface(x = ~x, y = ~y, z = ~int2_plane, showscale = FALSE)